# Load the Mailchimp user table from data/mailchimp_users.rds and inspect
# its columns and types.
mailchimp_users_tbl <- read_rds(here("data", "mailchimp_users.rds"))
glimpse(mailchimp_users_tbl)
## Rows: 23,672
## Columns: 10
## $ euid <chr> "000b09860b", "000d76e3f8", "0010ca98c6", "00129d76f9", …
## $ leid <chr> "308851471", "71513779", "62999259", "68149639", "708797…
## $ member_rating <int> 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ optin_time <date> 2019-10-16, 2019-05-22, 2018-11-19, 2019-02-27, 2019-05…
## $ confirm_time <date> 2019-10-16, 2019-05-22, 2018-11-19, 2019-02-27, 2019-05…
## $ country_code <chr> "us", "in", "lu", "us", NA, NA, NA, "in", NA, "ve", "mx"…
## $ region <chr> "fl", "tn", "lu", "ca", NA, NA, NA, "as", NA, "a", "jal"…
## $ last_changed <date> 2019-10-21, 2020-01-09, 2019-10-21, 2020-01-09, 2020-01…
## $ notes <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ date_added <date> 2020-03-02, 2020-03-02, 2020-03-02, 2020-03-02, 2020-03…
# Daily summary: count the number of opt-in events per day.
optins_day_tbl <- summarise_by_time(
  mailchimp_users_tbl,
  .date_var = optin_time,
  .by       = "day",
  optins    = n()
)
head(optins_day_tbl)
## # A tibble: 6 × 2
## optin_time optins
## <date> <int>
## 1 2018-06-08 1
## 2 2018-07-03 10
## 3 2018-07-04 15
## 4 2018-07-05 9
## 5 2018-07-06 11
## 6 2018-07-07 4
# Weekly summary: the same count, aggregated by week.
head(
  summarise_by_time(
    mailchimp_users_tbl,
    .date_var = optin_time,
    .by       = "week",
    optins    = n()
  )
)
## # A tibble: 6 × 2
## optin_time optins
## <date> <int>
## 1 2018-06-03 1
## 2 2018-07-01 49
## 3 2018-07-08 73
## 4 2018-07-15 62
## 5 2018-07-22 51
## 6 2018-07-29 52
# Monthly summary: the same count, aggregated by month.
head(
  summarise_by_time(
    mailchimp_users_tbl,
    .date_var = optin_time,
    .by       = "month",
    optins    = n()
  )
)
## # A tibble: 6 × 2
## optin_time optins
## <date> <int>
## 1 2018-06-01 1
## 2 2018-07-01 254
## 3 2018-08-01 210
## 4 2018-09-01 236
## 5 2018-10-01 267
## 6 2018-11-01 3955
index: the date or date-time column is called the index.
units: the description of a single timestamp within a time series.
scale: the most common difference between timestamps within a time series.
It is also called interval, frequency, period or periodicity.
Difference summary: the scale (interval) between time stamps, in seconds.
optins_day_tbl is an irregular time series. We need to fill in the gaps
(i.e., missing days) before conducting the analysis.
# Time series diagnostics for the un-padded daily table; notice diff.mean.
tk_summary_diagnostics(optins_day_tbl, .date_var = optin_time)
## # A tibble: 1 × 12
## n.obs start end units scale tzone diff.minimum diff.q1 diff.median
## <int> <date> <date> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 608 2018-06-08 2020-03-02 days day UTC 86400 86400 86400
## # ℹ 3 more variables: diff.mean <dbl>, diff.q3 <dbl>, diff.maximum <dbl>
# Pad the daily table into a regular time series (preview only; the days that
# get inserted have NA counts).
head(
  pad_by_time(optins_day_tbl, .date_var = optin_time)
)
## pad applied on the interval: day
## # A tibble: 6 × 2
## optin_time optins
## <date> <int>
## 1 2018-06-08 1
## 2 2018-06-09 NA
## 3 2018-06-10 NA
## 4 2018-06-11 NA
## 5 2018-06-12 NA
## 6 2018-06-13 NA
# Daily opt-in counts, padded to a regular daily series; days that had to be
# inserted by the padding get a count of 0.
subscribers_daily_tbl <- pad_by_time(
  summarise_by_time(
    mailchimp_users_tbl,
    .date_var = optin_time,
    .by       = "day",
    optins    = n()
  ),
  .by        = "day",
  .pad_value = 0
)
## .date_var is missing. Using: optin_time
# First rows of the padded daily series
head(subscribers_daily_tbl)
## # A tibble: 6 × 2
## optin_time optins
## <date> <int>
## 1 2018-06-08 1
## 2 2018-06-09 0
## 3 2018-06-10 0
## 4 2018-06-11 0
## 5 2018-06-12 0
## 6 2018-06-13 0
# Diagnostics for the padded daily series; notice diff.mean.
tk_summary_diagnostics(subscribers_daily_tbl, .date_var = optin_time)
## # A tibble: 1 × 12
## n.obs start end units scale tzone diff.minimum diff.q1 diff.median
## <int> <date> <date> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 634 2018-06-08 2020-03-02 days day UTC 86400 86400 86400
## # ℹ 3 more variables: diff.mean <dbl>, diff.q3 <dbl>, diff.maximum <dbl>
# Plot the padded daily opt-in series.
plot_time_series(
  subscribers_daily_tbl,
  .date_var = optin_time,
  .value    = optins
)

# Anomaly diagnostics on the same series, with .alpha = 0.01.
plot_anomaly_diagnostics(
  subscribers_daily_tbl,
  .date_var = optin_time,
  .value    = optins,
  .alpha    = 0.01
)
## frequency = 7 observations per 1 week
## trend = 92 observations per 3 months
# Same series on the log(x + 1) scale.
plot_time_series(subscribers_daily_tbl, optin_time, log(optins + 1))
log1p) is absolutely critical in
identifying lags and using lags in models.
# ACF diagnostics of the raw counts, limited to the first 100 lags.
plot_acf_diagnostics(
  subscribers_daily_tbl,
  optin_time,
  optins,
  .lags = 100
)

# ACF diagnostics of the log(x + 1) counts, using the default lag setting.
plot_acf_diagnostics(subscribers_daily_tbl, optin_time, log(optins + 1))
## Max lag exceeds data available. Using max lag: 633
# STL decomposition of the daily counts; the printed table has observed,
# season, trend, remainder and seasadj columns.
tk_stl_diagnostics(
  subscribers_daily_tbl,
  .date_var = optin_time,
  .value    = optins
)
## frequency = 7 observations per 1 week
## trend = 92 observations per 3 months
## # A tibble: 634 × 6
## optin_time observed season trend remainder seasadj
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2018-06-08 1 -0.121 0.248 0.873 1.12
## 2 2018-06-09 0 -5.10 0.415 4.68 5.10
## 3 2018-06-10 0 -3.32 0.583 2.74 3.32
## 4 2018-06-11 0 2.40 0.750 -3.15 -2.40
## 5 2018-06-12 0 2.68 0.918 -3.59 -2.68
## 6 2018-06-13 0 3.08 1.09 -4.16 -3.08
## 7 2018-06-14 0 0.390 1.25 -1.64 -0.390
## 8 2018-06-15 0 -0.121 1.42 -1.30 0.121
## 9 2018-06-16 0 -5.10 1.59 3.51 5.10
## 10 2018-06-17 0 -3.32 1.76 1.57 3.32
## # ℹ 624 more rows
# Linear regression of the raw counts on a numeric time trend plus weekday
# and month labels; .show_summary = TRUE also prints the lm summary.
plot_time_series_regression(
  subscribers_daily_tbl,
  .date_var     = optin_time,
  .formula      = optins ~ as.numeric(optin_time) +
    wday(optin_time, label = TRUE) +
    month(optin_time, label = TRUE),
  .show_summary = TRUE
)
##
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -108.2 -29.5 -7.2 5.6 3210.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -880.48792 590.84981 -1.490 0.1367
## as.numeric(optin_time) 0.05094 0.03281 1.553 0.1211
## wday(optin_time, label = TRUE).L -17.56395 14.70202 -1.195 0.2327
## wday(optin_time, label = TRUE).Q -32.03078 14.73770 -2.173 0.0301 *
## wday(optin_time, label = TRUE).C 14.23163 14.71486 0.967 0.3338
## wday(optin_time, label = TRUE)^4 -10.93010 14.69951 -0.744 0.4574
## wday(optin_time, label = TRUE)^5 24.05748 14.72539 1.634 0.1028
## wday(optin_time, label = TRUE)^6 -15.50517 14.75169 -1.051 0.2936
## month(optin_time, label = TRUE).L 3.47389 19.23763 0.181 0.8568
## month(optin_time, label = TRUE).Q 16.88664 19.78001 0.854 0.3936
## month(optin_time, label = TRUE).C -3.57288 20.14312 -0.177 0.8593
## month(optin_time, label = TRUE)^4 -31.83980 19.44091 -1.638 0.1020
## month(optin_time, label = TRUE)^5 -9.52191 19.75561 -0.482 0.6300
## month(optin_time, label = TRUE)^6 -38.58033 19.65726 -1.963 0.0501 .
## month(optin_time, label = TRUE)^7 -12.22472 20.44351 -0.598 0.5501
## month(optin_time, label = TRUE)^8 -19.82784 20.91266 -0.948 0.3434
## month(optin_time, label = TRUE)^9 2.47483 21.09561 0.117 0.9066
## month(optin_time, label = TRUE)^10 -2.43759 21.57412 -0.113 0.9101
## month(optin_time, label = TRUE)^11 -4.83019 19.92557 -0.242 0.8085
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 140 on 615 degrees of freedom
## Multiple R-squared: 0.04065, Adjusted R-squared: 0.01257
## F-statistic: 1.448 on 18 and 615 DF, p-value: 0.1032
## Per the author, the plain log() version below returns an error
## (the padded series contains zero counts), so it is left commented out.
# subscribers_daily_tbl %>%
# plot_time_series(optin_time, log(optins))

# Log plus 1: log1p() is defined for the zero counts.
plot_time_series(subscribers_daily_tbl, optin_time, log1p(optins))

# Inversion: expm1() undoes log1p(), returning the original scale.
plot_time_series(subscribers_daily_tbl, optin_time, expm1(log1p(optins)))

# Benefit: the same regression as before, now on the log1p scale.
plot_time_series_regression(
  subscribers_daily_tbl,
  .date_var     = optin_time,
  .formula      = log1p(optins) ~ as.numeric(optin_time) +
    wday(optin_time, label = TRUE) +
    month(optin_time, label = TRUE),
  .show_summary = TRUE
)
##
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6906 -0.4537 -0.0482 0.4072 5.3262
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.044e+01 3.364e+00 -14.994 < 2e-16 ***
## as.numeric(optin_time) 2.965e-03 1.868e-04 15.871 < 2e-16 ***
## wday(optin_time, label = TRUE).L -1.664e-01 8.371e-02 -1.988 0.047209 *
## wday(optin_time, label = TRUE).Q -7.487e-01 8.391e-02 -8.922 < 2e-16 ***
## wday(optin_time, label = TRUE).C 3.820e-02 8.378e-02 0.456 0.648621
## wday(optin_time, label = TRUE)^4 -1.451e-02 8.369e-02 -0.173 0.862376
## wday(optin_time, label = TRUE)^5 8.348e-02 8.384e-02 0.996 0.319785
## wday(optin_time, label = TRUE)^6 -1.178e-01 8.399e-02 -1.403 0.161245
## month(optin_time, label = TRUE).L -3.101e-01 1.095e-01 -2.832 0.004784 **
## month(optin_time, label = TRUE).Q 4.351e-01 1.126e-01 3.863 0.000124 ***
## month(optin_time, label = TRUE).C 1.380e-01 1.147e-01 1.203 0.229248
## month(optin_time, label = TRUE)^4 -5.089e-01 1.107e-01 -4.598 5.19e-06 ***
## month(optin_time, label = TRUE)^5 2.928e-01 1.125e-01 2.603 0.009451 **
## month(optin_time, label = TRUE)^6 6.176e-03 1.119e-01 0.055 0.956014
## month(optin_time, label = TRUE)^7 -3.219e-01 1.164e-01 -2.765 0.005860 **
## month(optin_time, label = TRUE)^8 -2.172e-01 1.191e-01 -1.824 0.068574 .
## month(optin_time, label = TRUE)^9 2.509e-01 1.201e-01 2.089 0.037101 *
## month(optin_time, label = TRUE)^10 5.022e-02 1.228e-01 0.409 0.682831
## month(optin_time, label = TRUE)^11 -4.688e-01 1.135e-01 -4.132 4.10e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7971 on 615 degrees of freedom
## Multiple R-squared: 0.4693, Adjusted R-squared: 0.4538
## F-statistic: 30.22 on 18 and 615 DF, p-value: < 2.2e-16
data_prepared_tbl <- subscribers_daily_tbl %>%
  # Preprocessing: log-interval transform (lower limit 0, offset 1), then
  # standardize the transformed values.
  mutate(
    optins_trans = log_interval_vec(optins, limit_lower = 0, offset = 1),
    optins_trans = standardize_vec(optins_trans)
  ) %>%
  # Fix missing values at the beginning of the series by starting on
  # 2018-07-03.
  filter_by_time(.start_date = "2018-07-03") %>%
  # Cleaning: compute a cleaned copy of the series, then use it only inside
  # the 2018-11-18 to 2018-11-20 window to replace the outlier.
  mutate(
    optins_trans_cleaned = ts_clean_vec(optins_trans, period = 7),
    optins_trans = ifelse(
      between_time(optin_time, "2018-11-18", "2018-11-20"),
      optins_trans_cleaned,
      optins_trans
    )
  ) %>%
  # Keep only the date and the transformed target.
  select(-optins, -optins_trans_cleaned)
## log_interval_vec():
## Using limit_lower: 0
## Using limit_upper: 3650.8
## Using offset: 1
## Standardization Parameters
## mean: -5.25529020756467
## standard deviation: 1.1109817111334
## .date_var is missing. Using: optin_time
# Reshape every "trans" column to long form and plot one panel per column.
plot_time_series(
  pivot_longer(data_prepared_tbl, contains("trans")),
  optin_time,
  value,
  name
)
# ---------------------------------------------------------------------------
# Save key parameters
# These are the values reported by log_interval_vec() and standardize_vec()
# above; they are needed to convert forecasts back to the original scale.
# ---------------------------------------------------------------------------

# Standardization parameters
std_mean <- -5.25529020756467
std_sd   <- 1.1109817111334

# Log-interval parameters
limit_lower <- 0
limit_upper <- 3650.8
offset      <- 1
# Full dataset

# Prediction horizon: 8 weeks of 7 days each.
horizon <- 8 * 7

# Feature engineering was critical to success in the M5 Competition, and
# engineered features such as the ones below are something the M5 winner used.

# Lag length for the engineered lag feature: 8 weeks, 7 days per week.
lag_period <- 8 * 7

# Window lengths used to create the rolling averages.
rolling_periods <- c(30, 60, 90)
# (note fragment carried over from the text) trend() in TSLM.
data_prepared_full_tbl <- data_prepared_tbl %>%
  # Append the future window: `horizon` new dates after the last observation.
  bind_rows(
    future_frame(
      .data       = .,
      .date_var   = optin_time,
      .length_out = horizon
    )
  ) %>%
  # Autocorrelated lag of the target, lag_period days back.
  tk_augment_lags(optins_trans, .lags = lag_period) %>%
  # Rolling means of the lagged target, one per window in rolling_periods.
  tk_augment_slidify(
    .value   = optins_trans_lag56,
    .f       = mean,
    .period  = rolling_periods,
    .align   = "center",
    .partial = TRUE
  )
# Plot every non-date column of the full table, without the smoother.
plot_time_series(
  pivot_longer(data_prepared_full_tbl, -optin_time),
  .date_var = optin_time,
  value,
  name,
  .smooth = FALSE
)
# First rows of the full table, rendered as a striped/hover HTML table.
kable_styling(
  kable(head(data_prepared_full_tbl), "html"),
  bootstrap_options = c("striped", "hover")
)
| optin_time | optins_trans | optins_trans_lag56 | optins_trans_lag56_roll_30 | optins_trans_lag56_roll_60 | optins_trans_lag56_roll_90 |
|---|---|---|---|---|---|
| 2018-07-03 | -0.4919060 | NA | NA | NA | NA |
| 2018-07-04 | -0.1534053 | NA | NA | NA | NA |
| 2018-07-05 | -0.5779424 | NA | NA | NA | NA |
| 2018-07-06 | -0.4133393 | NA | NA | NA | NA |
| 2018-07-07 | -1.2030828 | NA | NA | NA | NA |
| 2018-07-08 | -1.6633730 | NA | NA | NA | NA |
# Last rows of the full table (future dates), rendered the same way.
kable_styling(
  kable(tail(data_prepared_full_tbl), "html"),
  bootstrap_options = c("striped", "hover")
)
| optin_time | optins_trans | optins_trans_lag56 | optins_trans_lag56_roll_30 | optins_trans_lag56_roll_60 | optins_trans_lag56_roll_90 |
|---|---|---|---|---|---|
| 2020-04-22 | NA | 1.7391664 | 1.081005 | 0.7841984 | 0.6612021 |
| 2020-04-23 | NA | 2.5450484 | 1.117631 | 0.8045394 | 0.6618256 |
| 2020-04-24 | NA | 1.5884589 | 1.089140 | 0.8163169 | 0.6675904 |
| 2020-04-25 | NA | 0.4158820 | 1.087291 | 0.8376167 | 0.6723126 |
| 2020-04-26 | NA | 0.5295369 | 1.102533 | 0.8554082 | 0.6839993 |
| 2020-04-27 | NA | -0.3410451 | 1.093397 | 0.8976997 | 0.6936319 |
# Last 57 rows: the final observed day followed by the 56 future days.
tail(data_prepared_full_tbl, 57)
## # A tibble: 57 × 6
## optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30
## <date> <dbl> <dbl> <dbl>
## 1 2020-03-02 -0.341 1.71 0.387
## 2 2020-03-03 NA 1.06 0.443
## 3 2020-03-04 NA 2.07 0.463
## 4 2020-03-05 NA 1.36 0.509
## 5 2020-03-06 NA 0.251 0.524
## 6 2020-03-07 NA -0.779 0.534
## 7 2020-03-08 NA 0.0926 0.556
## 8 2020-03-09 NA 0.631 0.540
## 9 2020-03-10 NA 0.385 0.521
## 10 2020-03-11 NA 0.446 0.511
## # ℹ 47 more rows
## # ℹ 2 more variables: optins_trans_lag56_roll_60 <dbl>,
## # optins_trans_lag56_roll_90 <dbl>
# Rows with an observed target: these will be used to create the train and
# test sets.
data_prepared_tbl <- filter(data_prepared_full_tbl, !is.na(optins_trans))
data_prepared_tbl
## # A tibble: 609 × 6
## optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30
## <date> <dbl> <dbl> <dbl>
## 1 2018-07-03 -0.492 NA NA
## 2 2018-07-04 -0.153 NA NA
## 3 2018-07-05 -0.578 NA NA
## 4 2018-07-06 -0.413 NA NA
## 5 2018-07-07 -1.20 NA NA
## 6 2018-07-08 -1.66 NA NA
## 7 2018-07-09 -0.274 NA NA
## 8 2018-07-10 -0.212 NA NA
## 9 2018-07-11 -0.0986 NA NA
## 10 2018-07-12 -0.274 NA NA
## # ℹ 599 more rows
## # ℹ 2 more variables: optins_trans_lag56_roll_60 <dbl>,
## # optins_trans_lag56_roll_90 <dbl>
# Rows without a target (the future window): these will be used to make the
# forecast.
forecast_tbl <- filter(data_prepared_full_tbl, is.na(optins_trans))
forecast_tbl
## # A tibble: 56 × 6
## optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30
## <date> <dbl> <dbl> <dbl>
## 1 2020-03-03 NA 1.06 0.443
## 2 2020-03-04 NA 2.07 0.463
## 3 2020-03-05 NA 1.36 0.509
## 4 2020-03-06 NA 0.251 0.524
## 5 2020-03-07 NA -0.779 0.534
## 6 2020-03-08 NA 0.0926 0.556
## 7 2020-03-09 NA 0.631 0.540
## 8 2020-03-10 NA 0.385 0.521
## 9 2020-03-11 NA 0.446 0.511
## 10 2020-03-12 NA 0.135 0.524
## # ℹ 46 more rows
## # ℹ 2 more variables: optins_trans_lag56_roll_60 <dbl>,
## # optins_trans_lag56_roll_90 <dbl>
# Train/test split. The assessment window is set equal to the forecasting
# period; cumulative = TRUE uses all of the previous data in the dataset for
# training.
splits <- time_series_split(
  data_prepared_tbl,
  assess     = horizon,
  cumulative = TRUE
)
## Using date_var: optin_time
# Visualise the training and testing portions of the split.
plot_time_series_cv_plan(
  tk_time_series_cv_plan(splits),
  optin_time,
  optins_trans
)
It is important to try different feature engineering sets.
recipe() defines the data preprocessing operations.
Recipes are data dependent. New data must have the same column names and
classes for the recipe to be applied.
Feature engineering is the most critical part of time series analysis.
Multiple recipes
Create model-specific recipes that modify the base recipe.
The spline model in the example below will use natural splines to model trend.
The lag model in the example below will use Lag + Rolling features to model trend.
Many ML models will return an error if you feed them data in a date/time format.
step_timeseries_signature() is the preprocessing step that generates the time
series signature features.
matches() is a tidyselect helper that allows us to use Regular Expressions
(RegEx) to select column names.
Use () to create multi-regex search patterns.
The step_normalize() recipe step is equivalent to standardize_vec(); the timetk
author calls this normalization process "standardization" (compare normalize_vec()).
step_dummy() performs categorical encoding for either dummy encoding or one-hot encoding.
Not all ML models handle categorical data in this way, so we need to perform this
preprocessing to be on the safe side.
all_nominal() is a recipe column selector that selects any columns that are categorical.
# See also the selectors all_numeric() and all_predictors().

# Base recipe shared by the model-specific recipes: optins_trans is the
# outcome and every other column of the training set is a predictor.
recipe_spec_base <- recipe(optins_trans ~ ., data = training(splits)) %>%
  # Time series signature features derived from optin_time; drop the
  # iso/xts/hour/minute/second/am.pm variants.
  step_timeseries_signature(optin_time) %>%
  step_rm(matches("(iso)|(xts)|(hour)|(minute)|(second)|(am.pm)")) %>%
  # Standardization of the index.num, year and yday features
  step_normalize(matches("(index.num)|(year)|(yday)")) %>%
  # Dummy encoding (one-hot) of all nominal columns
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  # Interaction between the week2 feature and the weekday dummies
  step_interact(~ matches("week2") * matches("wday.lbl")) %>%
  # Fourier terms for several periods, K = 2 each
  step_fourier(optin_time, period = c(7, 14, 30, 90, 365), K = 2)
# prep() estimates the recipe; juice() takes the processed training dataset
# out so it can be inspected.
glimpse(juice(prep(recipe_spec_base)))
## Rows: 553
## Columns: 68
## $ optin_time <date> 2018-07-03, 2018-07-04, 2018…
## $ optins_trans_lag56 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_30 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_60 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_90 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans <dbl> -0.49190597, -0.15340526, -0.…
## $ optin_time_index.num <dbl> -1.727358, -1.721099, -1.7148…
## $ optin_time_year <dbl> -1.394192, -1.394192, -1.3941…
## $ optin_time_half <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ optin_time_quarter <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ optin_time_month <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
## $ optin_time_day <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_wday <int> 3, 4, 5, 6, 7, 1, 2, 3, 4, 5,…
## $ optin_time_mday <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_qday <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_yday <dbl> -0.26427519, -0.25454717, -0.…
## $ optin_time_mweek <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,…
## $ optin_time_week <int> 27, 27, 27, 27, 27, 27, 28, 2…
## $ optin_time_week2 <int> 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,…
## $ optin_time_week3 <int> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,…
## $ optin_time_week4 <int> 3, 3, 3, 3, 3, 3, 0, 0, 0, 0,…
## $ optin_time_mday7 <int> 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,…
## $ optin_time_month.lbl_01 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_02 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_03 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_04 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_05 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_06 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_07 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ optin_time_month.lbl_08 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_09 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_11 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_12 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_2 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ optin_time_wday.lbl_3 <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ optin_time_wday.lbl_4 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ optin_time_wday.lbl_5 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,…
## $ optin_time_wday.lbl_6 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_7 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_3 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_4 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_5 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_6 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_7 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_sin7_K1 <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos7_K1 <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin7_K2 <dbl> 4.338837e-01, -9.749279e-01, …
## $ optin_time_cos7_K2 <dbl> -0.9009689, -0.2225209, 1.000…
## $ optin_time_sin14_K1 <dbl> 7.818315e-01, 4.338837e-01, -…
## $ optin_time_cos14_K1 <dbl> -0.6234898, -0.9009689, -1.00…
## $ optin_time_sin14_K2 <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos14_K2 <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin30_K1 <dbl> 1.126516e-13, -2.079117e-01, …
## $ optin_time_cos30_K1 <dbl> -1.0000000, -0.9781476, -0.91…
## $ optin_time_sin30_K2 <dbl> -2.253032e-13, 4.067366e-01, …
## $ optin_time_cos30_K2 <dbl> 1.0000000, 0.9135455, 0.66913…
## $ optin_time_sin90_K1 <dbl> -8.660254e-01, -8.290376e-01,…
## $ optin_time_cos90_K1 <dbl> 0.5000000, 0.5591929, 0.61566…
## $ optin_time_sin90_K2 <dbl> -8.660254e-01, -9.271839e-01,…
## $ optin_time_cos90_K2 <dbl> -0.5000000, -0.3746066, -0.24…
## $ optin_time_sin365_K1 <dbl> -0.2135209, -0.2303057, -0.24…
## $ optin_time_cos365_K1 <dbl> -0.9769385, -0.9731183, -0.96…
## $ optin_time_sin365_K2 <dbl> 0.4171936, 0.4482293, 0.47873…
## $ optin_time_cos365_K2 <dbl> 0.9088176, 0.8939186, 0.87796…
# Spline recipe.
# - Drops optin_time (the date column); the numeric index created by the time
#   series signature (optin_time_index.num) is still available.
# - Models the trend with a natural spline on that numeric index.
# - Removes the lag-related features.
recipe_spec_1 <- recipe_spec_base %>%
  step_rm(optin_time) %>%
  # Natural spline transformation of the numeric time index
  step_ns(ends_with("index.num"), deg_free = 2) %>%
  # The lag features are named optins_trans_lag56 / optins_trans_lag56_roll_*,
  # so starts_with("lag_") selected nothing and the lags stayed in the recipe.
  # Match on "_lag" so they are actually removed.
  step_rm(contains("_lag"))

# Inspect the spline recipe itself (the base recipe was glimpsed here before).
# NOTE: recorded output that follows this chunk was captured before this fix.
recipe_spec_1 %>% prep() %>% juice() %>% glimpse()
## Rows: 553
## Columns: 68
## $ optin_time <date> 2018-07-03, 2018-07-04, 2018…
## $ optins_trans_lag56 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_30 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_60 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_90 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans <dbl> -0.49190597, -0.15340526, -0.…
## $ optin_time_index.num <dbl> -1.727358, -1.721099, -1.7148…
## $ optin_time_year <dbl> -1.394192, -1.394192, -1.3941…
## $ optin_time_half <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ optin_time_quarter <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ optin_time_month <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
## $ optin_time_day <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_wday <int> 3, 4, 5, 6, 7, 1, 2, 3, 4, 5,…
## $ optin_time_mday <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_qday <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_yday <dbl> -0.26427519, -0.25454717, -0.…
## $ optin_time_mweek <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,…
## $ optin_time_week <int> 27, 27, 27, 27, 27, 27, 28, 2…
## $ optin_time_week2 <int> 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,…
## $ optin_time_week3 <int> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,…
## $ optin_time_week4 <int> 3, 3, 3, 3, 3, 3, 0, 0, 0, 0,…
## $ optin_time_mday7 <int> 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,…
## $ optin_time_month.lbl_01 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_02 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_03 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_04 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_05 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_06 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_07 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ optin_time_month.lbl_08 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_09 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_11 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_12 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_2 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ optin_time_wday.lbl_3 <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ optin_time_wday.lbl_4 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ optin_time_wday.lbl_5 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,…
## $ optin_time_wday.lbl_6 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_7 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_3 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_4 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_5 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_6 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_7 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_sin7_K1 <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos7_K1 <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin7_K2 <dbl> 4.338837e-01, -9.749279e-01, …
## $ optin_time_cos7_K2 <dbl> -0.9009689, -0.2225209, 1.000…
## $ optin_time_sin14_K1 <dbl> 7.818315e-01, 4.338837e-01, -…
## $ optin_time_cos14_K1 <dbl> -0.6234898, -0.9009689, -1.00…
## $ optin_time_sin14_K2 <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos14_K2 <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin30_K1 <dbl> 1.126516e-13, -2.079117e-01, …
## $ optin_time_cos30_K1 <dbl> -1.0000000, -0.9781476, -0.91…
## $ optin_time_sin30_K2 <dbl> -2.253032e-13, 4.067366e-01, …
## $ optin_time_cos30_K2 <dbl> 1.0000000, 0.9135455, 0.66913…
## $ optin_time_sin90_K1 <dbl> -8.660254e-01, -8.290376e-01,…
## $ optin_time_cos90_K1 <dbl> 0.5000000, 0.5591929, 0.61566…
## $ optin_time_sin90_K2 <dbl> -8.660254e-01, -9.271839e-01,…
## $ optin_time_cos90_K2 <dbl> -0.5000000, -0.3746066, -0.24…
## $ optin_time_sin365_K1 <dbl> -0.2135209, -0.2303057, -0.24…
## $ optin_time_cos365_K1 <dbl> -0.9769385, -0.9731183, -0.96…
## $ optin_time_sin365_K2 <dbl> 0.4171936, 0.4482293, 0.47873…
## $ optin_time_cos365_K2 <dbl> 0.9088176, 0.8939186, 0.87796…
# Lag recipe: drops the date column and keeps the lag / rolling features.
recipe_spec_2 <- recipe_spec_base %>%
  step_rm(optin_time) %>%
  # Remove rows with missing values in the lag-related columns. Those columns
  # are named optins_trans_lag56 / optins_trans_lag56_roll_*, so
  # starts_with("lag_") selected nothing and no rows were dropped; match on
  # "_lag" instead.
  step_naomit(contains("_lag"))

# NOTE: recorded output that follows this chunk was captured before this fix.
recipe_spec_2 %>% prep() %>% juice() %>% glimpse()
## Rows: 553
## Columns: 67
## $ optins_trans_lag56 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_30 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_60 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_90 <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans <dbl> -0.49190597, -0.15340526, -0.…
## $ optin_time_index.num <dbl> -1.727358, -1.721099, -1.7148…
## $ optin_time_year <dbl> -1.394192, -1.394192, -1.3941…
## $ optin_time_half <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ optin_time_quarter <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ optin_time_month <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
## $ optin_time_day <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_wday <int> 3, 4, 5, 6, 7, 1, 2, 3, 4, 5,…
## $ optin_time_mday <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_qday <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_yday <dbl> -0.26427519, -0.25454717, -0.…
## $ optin_time_mweek <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,…
## $ optin_time_week <int> 27, 27, 27, 27, 27, 27, 28, 2…
## $ optin_time_week2 <int> 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,…
## $ optin_time_week3 <int> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,…
## $ optin_time_week4 <int> 3, 3, 3, 3, 3, 3, 0, 0, 0, 0,…
## $ optin_time_mday7 <int> 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,…
## $ optin_time_month.lbl_01 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_02 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_03 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_04 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_05 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_06 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_07 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ optin_time_month.lbl_08 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_09 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_11 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_12 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_2 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ optin_time_wday.lbl_3 <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ optin_time_wday.lbl_4 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ optin_time_wday.lbl_5 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,…
## $ optin_time_wday.lbl_6 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_7 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_3 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_4 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_5 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_6 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_7 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_sin7_K1 <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos7_K1 <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin7_K2 <dbl> 4.338837e-01, -9.749279e-01, …
## $ optin_time_cos7_K2 <dbl> -0.9009689, -0.2225209, 1.000…
## $ optin_time_sin14_K1 <dbl> 7.818315e-01, 4.338837e-01, -…
## $ optin_time_cos14_K1 <dbl> -0.6234898, -0.9009689, -1.00…
## $ optin_time_sin14_K2 <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos14_K2 <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin30_K1 <dbl> 1.126516e-13, -2.079117e-01, …
## $ optin_time_cos30_K1 <dbl> -1.0000000, -0.9781476, -0.91…
## $ optin_time_sin30_K2 <dbl> -2.253032e-13, 4.067366e-01, …
## $ optin_time_cos30_K2 <dbl> 1.0000000, 0.9135455, 0.66913…
## $ optin_time_sin90_K1 <dbl> -8.660254e-01, -8.290376e-01,…
## $ optin_time_cos90_K1 <dbl> 0.5000000, 0.5591929, 0.61566…
## $ optin_time_sin90_K2 <dbl> -8.660254e-01, -9.271839e-01,…
## $ optin_time_cos90_K2 <dbl> -0.5000000, -0.3746066, -0.24…
## $ optin_time_sin365_K1 <dbl> -0.2135209, -0.2303057, -0.24…
## $ optin_time_cos365_K1 <dbl> -0.9769385, -0.9731183, -0.96…
## $ optin_time_sin365_K2 <dbl> 0.4171936, 0.4482293, 0.47873…
## $ optin_time_cos365_K2 <dbl> 0.9088176, 0.8939186, 0.87796…
# Linear regression specification using the "lm" engine.
model_spec_lm <- set_engine(linear_reg(), "lm")

# A workflow bundles the model and recipe objects; fit it on the training set.
workflow_fit_lm_1_spline <- workflow() %>%
  add_model(model_spec_lm) %>%
  add_recipe(recipe_spec_1) %>%
  fit(training(splits))

# Register the fitted workflow in a modeltime table.
model_tbl <- modeltime_table(workflow_fit_lm_1_spline)

# Calibrate on the held-out test set.
calibration_tbl <- modeltime_calibrate(model_tbl, new_data = testing(splits))
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
# Test-set accuracy metrics for the calibrated model
modeltime_accuracy(calibration_tbl)
## # A tibble: 1 × 9
## .model_id .model_desc .type mae mape mase smape rmse rsq
## <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 LM Test 0.670 873. 0.907 128. 0.917 0.184
# Forecast over the test window and plot it against the actual data.
plot_modeltime_forecast(
  modeltime_forecast(
    calibration_tbl,
    new_data    = testing(splits),
    actual_data = data_prepared_tbl
  )
)
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
workflow_fit_lm_2_lag <- workflow() %>%
add_model(model_spec_lm) %>%
add_recipe(recipe_spec_2) %>%
fit(training(splits))
modeltime_tbl <- modeltime_table(
workflow_fit_lm_1_spline,
workflow_fit_lm_2_lag
)
# Calibrate both models on the testing split so that prediction intervals can
# be created for each of them.
calibration_tbl <- modeltime_calibrate(
  modeltime_tbl,
  new_data = testing(splits)
)
## Warning: There were 2 warnings in `dplyr::mutate()`.
## The first warning was:
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Compare the performance of the two calibrated models.
modeltime_accuracy(calibration_tbl)
## # A tibble: 2 × 9
## .model_id .model_desc .type mae mape mase smape rmse rsq
## <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 LM Test 0.670 873. 0.907 128. 0.917 0.184
## 2 2 LM Test 0.761 875. 1.03 142. 0.999 0.245
# Plot the forecasts of both models over the testing split, with the prepared
# series passed as actual data.
plot_modeltime_forecast(
  modeltime_forecast(
    calibration_tbl,
    new_data = testing(splits),
    actual_data = data_prepared_tbl
  )
)
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
# Refit the calibrated models on the full prepared data set.
refit_tbl <- modeltime_refit(calibration_tbl, data = data_prepared_tbl)
# Forecast with the refit models over forecast_tbl, undo both transformations
# on the columns from .value through .conf_hi, then plot the result.
refit_tbl %>%
  modeltime_forecast(
    new_data = forecast_tbl,
    actual_data = data_prepared_tbl
  ) %>%
  # Inversion, step 1: reverse the standardization with the stored mean / sd.
  mutate(
    across(
      .value:.conf_hi,
      .fns = ~ standardize_inv_vec(x = .x, mean = std_mean, sd = std_sd)
    )
  ) %>%
  # Inversion, step 2: reverse the log-interval transformation with the stored
  # limits and offset.
  mutate(
    across(
      .value:.conf_hi,
      .fns = ~ log_interval_inv_vec(
        x = .x,
        limit_lower = limit_lower,
        limit_upper = limit_upper,
        offset = offset
      )
    )
  ) %>%
  plot_modeltime_forecast()
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
## There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
# Bundle the data, recipes, fitted workflows and inversion parameters into one
# nested, named list.
feature_engineering_artifacts_list <- list(
  # Prepared series and the future frame used for forecasting
  data = list(
    data_prepared_tbl = data_prepared_tbl,
    forecast_tbl      = forecast_tbl
  ),

  # Preprocessing recipes
  recipes = list(
    recipe_spec_base = recipe_spec_base,
    recipe_spec_1    = recipe_spec_1,
    recipe_spec_2    = recipe_spec_2
  ),

  # Fitted workflows
  models = list(
    workflow_fit_lm_1_spline = workflow_fit_lm_1_spline,
    workflow_fit_lm_2_lag    = workflow_fit_lm_2_lag
  ),

  # Parameters needed to invert the standardization
  standardize = list(
    std_mean = std_mean,
    std_sd   = std_sd
  ),

  # Parameters needed to invert the log-interval transformation
  log_interval = list(
    limit_lower = limit_lower,
    limit_upper = limit_upper,
    offset      = offset
  )
)

# Print the assembled list for inspection.
feature_engineering_artifacts_list
## $data
## $data$data_prepared_tbl
## # A tibble: 609 × 6
## optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30
## <date> <dbl> <dbl> <dbl>
## 1 2018-07-03 -0.492 NA NA
## 2 2018-07-04 -0.153 NA NA
## 3 2018-07-05 -0.578 NA NA
## 4 2018-07-06 -0.413 NA NA
## 5 2018-07-07 -1.20 NA NA
## 6 2018-07-08 -1.66 NA NA
## 7 2018-07-09 -0.274 NA NA
## 8 2018-07-10 -0.212 NA NA
## 9 2018-07-11 -0.0986 NA NA
## 10 2018-07-12 -0.274 NA NA
## # ℹ 599 more rows
## # ℹ 2 more variables: optins_trans_lag56_roll_60 <dbl>,
## # optins_trans_lag56_roll_90 <dbl>
##
## $data$forecast_tbl
## # A tibble: 56 × 6
## optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30
## <date> <dbl> <dbl> <dbl>
## 1 2020-03-03 NA 1.06 0.443
## 2 2020-03-04 NA 2.07 0.463
## 3 2020-03-05 NA 1.36 0.509
## 4 2020-03-06 NA 0.251 0.524
## 5 2020-03-07 NA -0.779 0.534
## 6 2020-03-08 NA 0.0926 0.556
## 7 2020-03-09 NA 0.631 0.540
## 8 2020-03-10 NA 0.385 0.521
## 9 2020-03-11 NA 0.446 0.511
## 10 2020-03-12 NA 0.135 0.524
## # ℹ 46 more rows
## # ℹ 2 more variables: optins_trans_lag56_roll_60 <dbl>,
## # optins_trans_lag56_roll_90 <dbl>
##
##
## $recipes
## $recipes$recipe_spec_base
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 5
##
## ── Operations
## • Timeseries signature features from: optin_time
## • Variables removed: matches("(iso)|(xts)|(hour)|(minute)|(second)|(am.pm)")
## • Centering and scaling for: matches("(index.num)|(year)|(yday)")
## • Dummy variables from: all_nominal()
## • Interactions with: matches("week2") * matches("wday.lbl")
## • Fourier series features from: optin_time
##
## $recipes$recipe_spec_1
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 5
##
## ── Operations
## • Timeseries signature features from: optin_time
## • Variables removed: matches("(iso)|(xts)|(hour)|(minute)|(second)|(am.pm)")
## • Centering and scaling for: matches("(index.num)|(year)|(yday)")
## • Dummy variables from: all_nominal()
## • Interactions with: matches("week2") * matches("wday.lbl")
## • Fourier series features from: optin_time
## • Variables removed: optin_time
## • Natural splines on: ends_with("index.num")
## • Variables removed: starts_with("lag_")
##
## $recipes$recipe_spec_2
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 5
##
## ── Operations
## • Timeseries signature features from: optin_time
## • Variables removed: matches("(iso)|(xts)|(hour)|(minute)|(second)|(am.pm)")
## • Centering and scaling for: matches("(index.num)|(year)|(yday)")
## • Dummy variables from: all_nominal()
## • Interactions with: matches("week2") * matches("wday.lbl")
## • Fourier series features from: optin_time
## • Variables removed: optin_time
## • Removing rows with NA values in: starts_with("lag_")
##
##
## $models
## $models$workflow_fit_lm_1_spline
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 9 Recipe Steps
##
## • step_timeseries_signature()
## • step_rm()
## • step_normalize()
## • step_dummy()
## • step_interact()
## • step_fourier()
## • step_rm()
## • step_ns()
## • step_rm()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept)
## -4.131e+02
## optins_trans_lag56
## -2.963e-02
## optins_trans_lag56_roll_30
## 7.936e-01
## optins_trans_lag56_roll_60
## -1.341e+00
## optins_trans_lag56_roll_90
## -1.182e+00
## optin_time_year
## 1.024e+00
## optin_time_half
## -5.678e-02
## optin_time_quarter
## 6.313e+01
## optin_time_month
## 2.349e+01
## optin_time_day
## 7.787e-01
## optin_time_wday
## -2.292e-01
## optin_time_mday
## NA
## optin_time_qday
## 6.941e-01
## optin_time_yday
## -1.563e+02
## optin_time_mweek
## -5.133e-02
## optin_time_week
## 4.380e-01
## optin_time_week2
## 3.117e-01
## optin_time_week3
## 1.782e-02
## optin_time_week4
## -1.144e-02
## optin_time_mday7
## -2.573e-02
## optin_time_month.lbl_01
## 1.398e+00
## optin_time_month.lbl_02
## 2.184e+00
## optin_time_month.lbl_03
##
## ...
## and 92 more lines.
##
## $models$workflow_fit_lm_2_lag
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 8 Recipe Steps
##
## • step_timeseries_signature()
## • step_rm()
## • step_normalize()
## • step_dummy()
## • step_interact()
## • step_fourier()
## • step_rm()
## • step_naomit()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept)
## -3.934e+02
## optins_trans_lag56
## -1.898e-02
## optins_trans_lag56_roll_30
## 1.189e+00
## optins_trans_lag56_roll_60
## -1.333e+00
## optins_trans_lag56_roll_90
## 1.054e+00
## optin_time_index.num
## -2.355e+02
## optin_time_year
## 2.628e+02
## optin_time_half
## -3.475e-02
## optin_time_quarter
## 5.936e+01
## optin_time_month
## 2.297e+01
## optin_time_day
## 7.589e-01
## optin_time_wday
## -2.277e-01
## optin_time_mday
## NA
## optin_time_qday
## 6.545e-01
## optin_time_yday
## NA
## optin_time_mweek
## -4.540e-02
## optin_time_week
## 4.645e-01
## optin_time_week2
## 2.713e-01
## optin_time_week3
## 1.398e-02
## optin_time_week4
## -1.267e-02
## optin_time_mday7
## -2.454e-02
## optin_time_month.lbl_01
## 1.347e+00
## optin_time_month.lbl_02
##
## ...
## and 90 more lines.
##
##
## $standardize
## $standardize$std_mean
## [1] -5.25529
##
## $standardize$std_sd
## [1] 1.110982
##
##
## $log_interval
## $log_interval$limit_lower
## [1] 0
##
## $log_interval$limit_upper
## [1] 3650.8
##
## $log_interval$offset
## [1] 1
# Save the artifacts list to an .rds file under model/.
write_rds(
  feature_engineering_artifacts_list,
  "model/feature_engineering_artifacts_list.rds"
)